Basa data:
Balsamorhiza sagittata
Balsamorhiza hookeri
BAHO3 x BASA hybrid
Lupinus wyethii (Outgroup)
Located in:
Raw data located in:
/archive/parchman_lab/rawdata_to_backup/BASA_CHDO_rawNOVASEQ/Library2-BASA_S2_L002_R1_001.fastq.gz
Cleaned and demultiplexed .fastq files by individual:
/working/tfaske/balsam/demult/fastq (BASA)
/working/tfaske/balsam/demult/outgroup_fastq (outgroup)
Generating the reference with dDocent:
mkdir refOpt (place here 4 individuals per pop)
./ReferenceOpt.sh 4 10 4 10 SE 5 0.95 0.99 0.005
#####
R
library(ggplot2)
data.table <- read.table("kopt.data", header = FALSE, col.names= c("k1","k2","Similarity", "Contigs"))
data.table$K1K2 <- paste(data.table$k1, data.table$k2, sep=",")
df=data.frame(data.table)
df$K1K2 <- as.factor(df$K1K2)
p <- ggplot(df, aes(x=Similarity, y=Contigs, group=K1K2)) + scale_x_continuous(breaks=seq(0.8,0.98,0.01)) + geom_line(aes(colour = K1K2))
p
ggsave("kvalues.pdf",p,height=8,width = 10,units = 'in')
#####
Run dDocent on this subset with the correct assembly parameters (skipping mapping and snp calling)
./RefMapOpt.sh 4 6 4 6 0.975 SE 10
./compress.sh (compress intermediate files)
Move the reads and reference files to the main directory
Run dDocent on the full data set, skipping trimming, assembly, snp calling
SNP calling:
Run ./bwa_sort.sh
INDS=($(for i in /working/mascaro/basa/outgroup_2/*.F.fq.gz; do echo $(basename ${i%.F.fq.gz*}); done))
module load bwa/0.7.8
module load samtools/1.3
for IND in ${INDS[@]};
do
# declare variables
REF=/working/mascaro/basa/outgroup_2/reference.fasta
FORWARD=/working/mascaro/basa/outgroup_2/${IND}.F.fq.gz
OUTPUT=/working/mascaro/basa/outgroup_2/assembly/${IND}_sort.bam
# then align and sort
echo "Aligning $IND with bwa"
bwa mem -M -t 10 $REF $FORWARD \
| samtools view -b | \
samtools sort -T ${IND} > $OUTPUT
done
###bcftools
Run ./bcftools.sh
REF=/working/mascaro/basa/outgroup_2/reference.fasta
module load bcftools/1.9
bcftools mpileup -a AD,DP,SP -Ou -f $REF \
./assembly/*_sort.bam | bcftools call -f GQ,GP \
-mO z -o ./basa.vcf.gz
Filtering:
####
#The obtained vcf file:
#Final.recode.vcf:
# Statistics with Vcftools:
vcftools --gzvcf basa.vcf.gz --site-quality --out /working/mascaro/basa/outgroup_2/filter/quality
vcftools --gzvcf basa.vcf.gz --freq2 --out /working/mascaro/basa/outgroup_2/filter --max-alleles 2
vcftools --gzvcf basa.vcf.gz --depth --out /working/mascaro/basa/outgroup_2/filter/meandepthind
vcftools --gzvcf basa.vcf.gz --site-mean-depth --out /working/mascaro/basa/outgroup_2/filter/meandepsite
vcftools --gzvcf basa.vcf.gz --missing-indv --out /working/mascaro/basa/outgroup_2/filter/missing
vcftools --gzvcf basa.vcf.gz --missing-site --out /working/mascaro/basa/outgroup_2/filter/missingsite
##### Examining statistics in R
R
library(ggplot2)
library(tidyverse)
var_qual <- read_delim("quality.lqual", delim = "\t",col_names = c("chr", "pos", "qual"), skip = 1)
a <- ggplot(var_qual, aes(qual)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("quality.pdf",a,height=8,width = 10,units = 'in')
var_depth <- read_delim("meandepthind.idepth", delim = "\t", col_names = c("chr", "pos", "mean_depth", "var_depth"), skip =1)
a <- ggplot(var_depth, aes(mean_depth)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_depth$mean_depth)
a + theme_light() + xlim(0, 100)
ggsave("meandepth_ind.pdf",a,height=8,width = 10,units = 'in')
var_miss <- read_delim("missingsite.lmiss", delim = "\t",col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss","fmiss"), skip = 1)
a <- ggplot(var_miss, aes(fmiss)) + geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_miss$fmiss)
ggsave("missing.pdf",a,height=8,width = 10,units = 'in')
ind_depth <- read_delim("meandepsite.ldepth.mean", delim = "\t", col_names = c("ind", "nsites", "depth"), skip =1)
a <- ggplot(ind_depth, aes(depth)) + geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("meandepth_site.pdf",a,height=8,width = 10,units = 'in')
ind_miss <- read_delim("missing.imiss", delim = "\t",col_names = c("ind", "ndata", "nfiltered", "nmiss","fmiss"), skip = 1)
a <- ggplot(ind_miss, aes(fmiss)) + geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("missing.pdf",a,height=8,width = 10,units = 'in')
#######
# Filtering:
vcftools --vcf basa.vcf.gz --maf 0.05 --max-missing 0.6 --minQ 30 --min-meanDP 5 --max-meanDP 80 --minDP 5 --maxDP 80 --recode --out basa_filtered
awk '$5 > 0.5' missing.imiss | cut -f1 > lowDP.indv
vcftools --vcf basa_filtered.vcf.recode.vcf --remove lowDP.indv --recode --recode-INFO-all --out variants_clean
vcftools --vcf variants_clean.recode.vcf --out basa_filtered_final --recode --remove-filtered-all
# Statistics with Vcftools:
vcftools --gzvcf basa_filtered_final.recode.vcf --site-quality --out quality
vcftools --gzvcf basa_filtered_final.recode.vcf --freq2 --out $OUT --max-alleles 2
vcftools --gzvcf basa_filtered_final.recode.vcf --depth --out meandepthind
vcftools --gzvcf basa_filtered_final.recode.vcf --site-mean-depth --out meandepsite
vcftools --gzvcf basa_filtered_final.recode.vcff --missing-indv --out missing
vcftools --gzvcf basa_filtered_final.recode.vcf --missing-site --out missingsite
# Filtering again:
vcftools --vcf basa_filtered_final.recode.vcf --maf 0.05 --max-missing 0.75 --m
inQ 30 --min-meanDP 10 --max-meanDP 80 --minDP 10 --maxDP 80 --recode --out basa_final2
# Statistics with Vcftools:
vcftools --gzvcf basa_filtered2.recode.vcf --site-quality --out quality
vcftools --gzvcf basa_filtered2.recode.vcf --freq2 --out $OUT --max-alleles 2
vcftools --gzvcf basa_filtered2.recode.vcf --depth --out meandepthind
vcftools --gzvcf basa_filtered2.recode.vcf --site-mean-depth --out meandepsite
vcftools --gzvcf basa_filtered2.recode.vcf --missing-indv --out missing
vcftools --gzvcf basa_filtered2.recode.vcf --missing-site --out missingsite
After filtering: 27621 loci and 379 individuals
Genetic diversity statistics:
Overall, excess
of heterozygotes (Ho>He; negative Fis values), high nucleotide
diversity, and low Fst values. AMOVA showing only 22 % of the variance
explained by population.
library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
basa.vcf <- read.vcfR("basa_final.vcf")
basa.vcf
##Fis, Fst
my_genind <- vcfR2genind(basa.vcf)
x<- my_genind
pops <- as.factor(c(pops))
#Population specific Fis:
myData.hfstat <- genind2hierfstat(my_genind, pop = pops)
basicstat <- basic.stats(myData.hfstat, diploid = TRUE, digits = 4)
basicstat$Fis
write.csv(basicstat$Fis, "Fis.csv")
##Bootstrapping over loci of population's Fis
boot.ppfis(myData.hfstat)
#Nei's Pairwise FSTs:
x <- genet.dist(myData.hfstat,diploid=TRUE,method="Ds")# Nei’s standard genetic distance
fst <- as.matrix(x)
write.table(fst, "Fst.txt")
##Bootstrapping over loci of pairwise Fst
#boot.ppfst(myData.hfstat)
basicstat$Ho #observed
write.csv(basicstat$Ho, "HO.csv")
basicstat$Hs #expected
write.csv(basicstat$Hs, "Hs.csv")
basicstat
###########################################
##vcftools Pi and TajimaD
#!/bin/sh
# .vcf file
# .pop file (unique names of pops, one per line)
# .map file (individual to population mapping file — 2 columns)
#cat basa.pop | while read line;
#do
#grep "$line" basa.map > $line.pop
#done
#for p in *.pop
#do
#vcftools --vcf basa_final.vcf --keep $p --site-pi --out $p
#done
#for p in *.pop
#do
#vcftools --vcf basa_final.vcf --keep $p --TajimaD 100 --out $p
#done
#AMOVA:
library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
library("poppr")
library("magrittr")
library(ape)
# vcf to genind
BASA.VCF <- read.vcfR("basa.vcf")
my_genind <- vcfR2genind(BASA.VCF)
### Give a data set with stratification (individual, populations, subpopulations..)
file.hier = read.table("ind_pop.txt", header=T)
strata(my_genind) = file.hier
my_genind
## amova with stratifications
amova.res.95 = poppr.amova(acth.hier.strat, ~pop/subpop, cutoff=0.95)
amova.res.95
write.table(amova.res.95$results, sep = ",", file = "results_amova.csv")
write.table(amova.res.95$componentsofcovariance, sep = ",", file = "Components_covariance.csv")
write.table(amova.res.95$statphi, sep = ",", file = "phi.csv")
## To test if populations are significantly different
amova.test.95 = randtest(amova.res.95)
amova.test.95
| Pop | He | Ho | Fis | π | TajimaD |
|---|---|---|---|---|---|
| TM | 0.284 | 0.428 | -0.385 | 0.241 | 0.313 |
| KB | 0.288 | 0.430 | -0.357 | 0.239 | 0.339 |
| CC | 0.291 | 0.432 | -0.354 | 0.246 | 0.275 |
| KA | 0.291 | 0.432 | -0.372 | 0.249 | 0.295 |
| RP | 0.290 | 0.432 | -0.364 | 0.251 | 0.272 |
| BH | 0.289 | 0.434 | -0.380 | 0.231 | 0.356 |
| LL | 0.288 | 0.434 | -0.385 | 0.242 | 0.286 |
| CF | 0.303 | 0.435 | -0.328 | 0.265 | 0.222 |
| CL | 0.290 | 0.437 | -0.377 | 0.246 | 0.333 |
| SP | 0.293 | 0.437 | -0.373 | 0.251 | 0.271 |
| EC | 0.295 | 0.442 | -0.363 | 0.253 | 0.265 |
| GA | 0.298 | 0.442 | -0.358 | 0.262 | 0.242 |
| AN | 0.298 | 0.444 | -0.358 | 0.258 | 0.308 |
| SL | 0.300 | 0.444 | -0.358 | 0.261 | 0.211 |
| SC | 0.300 | 0.444 | -0.349 | 0.265 | 0.227 |
| LC | 0.299 | 0.445 | -0.362 | 0.251 | 0.351 |
| AS | 0.300 | 0.445 | -0.352 | 0.271 | 0.284 |
| BB | 0.305 | 0.446 | -0.339 | 0.268 | 0.262 |
| WC | 0.304 | 0.447 | -0.342 | 0.271 | 0.272 |
| JV | 0.304 | 0.448 | -0.346 | 0.264 | 0.285 |
| SD | 0.302 | 0.449 | -0.355 | 0.271 | 0.299 |
| KM | 0.301 | 0.449 | -0.382 | 0.281 | 0.367 |
| RC | 0.303 | 0.450 | -0.351 | 0.272 | 0.265 |
| RL | 0.304 | 0.450 | -0.348 | 0.272 | 0.283 |
| GJ | 0.301 | 0.451 | -0.362 | 0.269 | 0.263 |
| MM | 0.303 | 0.452 | -0.365 | 0.272 | 0.311 |
| GB | 0.307 | 0.453 | -0.347 | 0.277 | 0.242 |
| CN | 0.308 | 0.454 | -0.347 | 0.272 | 0.261 |
| SY | 0.306 | 0.455 | -0.358 | 0.268 | 0.274 |
| MT | 0.303 | 0.456 | -0.372 | 0.276 | 0.324 |
| LM | 0.306 | 0.456 | -0.356 | 0.279 | 0.222 |
| QG | 0.307 | 0.457 | -0.359 | 0.271 | 0.311 |
.inline-btns p {
display: inline;
}
library(downloadthis)
Fst_results <- read.csv("files/Fst.csv")
d_btn <- Fst_results %>%
download_this(
path = system.file("files/", package = "downloadthis"),
output_name = "Fst values",
output_extension = ".csv",
button_label = "Fst",
button_type = "default",
has_icon = TRUE,
icon = "fa fa-save"
)
| X | AN | AS | BB | BH | CC | CF | CL | CN | EC | GA | GB | GJ | JV | KA | KB | KM | LC | LL | LM | MM | MT | QG | RC | RL | RP | SC | SD | SL | SP | SY | TM | WC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AN | 0.0000000 | 0.0212699 | 0.0200213 | 0.0346840 | 0.0327182 | 0.0267836 | 0.0246469 | 0.0189064 | 0.0190003 | 0.0266201 | 0.0175230 | 0.0164721 | 0.0162117 | 0.0355746 | 0.0303433 | 0.0285037 | 0.0221734 | 0.0338473 | 0.0188846 | 0.0178398 | 0.0170678 | 0.0179127 | 0.0164681 | 0.0181886 | 0.0323123 | 0.0207600 | 0.0178579 | 0.0283760 | 0.0337146 | 0.0169339 | 0.0363946 | 0.0228502 |
| AS | 0.0212699 | 0.0000000 | 0.0186616 | 0.0329252 | 0.0235971 | 0.0224353 | 0.0253128 | 0.0183065 | 0.0245430 | 0.0258348 | 0.0176178 | 0.0197878 | 0.0189161 | 0.0335354 | 0.0296384 | 0.0260247 | 0.0236629 | 0.0337474 | 0.0174792 | 0.0202507 | 0.0204187 | 0.0200585 | 0.0185294 | 0.0197295 | 0.0297079 | 0.0201100 | 0.0185614 | 0.0272561 | 0.0262603 | 0.0216470 | 0.0280640 | 0.0152160 |
| BB | 0.0200213 | 0.0186616 | 0.0000000 | 0.0319076 | 0.0244552 | 0.0214392 | 0.0248414 | 0.0180955 | 0.0231898 | 0.0247534 | 0.0179690 | 0.0187357 | 0.0174954 | 0.0321670 | 0.0286313 | 0.0282892 | 0.0223153 | 0.0331018 | 0.0182653 | 0.0198932 | 0.0202839 | 0.0192087 | 0.0166406 | 0.0183536 | 0.0289344 | 0.0155876 | 0.0196366 | 0.0263040 | 0.0267735 | 0.0195137 | 0.0290556 | 0.0178810 |
| BH | 0.0346840 | 0.0329252 | 0.0319076 | 0.0000000 | 0.0341948 | 0.0348293 | 0.0410418 | 0.0333634 | 0.0331291 | 0.0158469 | 0.0328156 | 0.0339422 | 0.0340365 | 0.0252982 | 0.0261380 | 0.0397149 | 0.0390792 | 0.0295467 | 0.0321425 | 0.0365994 | 0.0370556 | 0.0365161 | 0.0333541 | 0.0299791 | 0.0154639 | 0.0342866 | 0.0354557 | 0.0220385 | 0.0350017 | 0.0367025 | 0.0365534 | 0.0304946 |
| CC | 0.0327182 | 0.0235971 | 0.0244552 | 0.0341948 | 0.0000000 | 0.0237590 | 0.0374692 | 0.0258122 | 0.0321994 | 0.0289741 | 0.0265930 | 0.0314310 | 0.0308357 | 0.0325807 | 0.0299775 | 0.0322526 | 0.0352532 | 0.0345354 | 0.0228768 | 0.0331576 | 0.0342563 | 0.0327005 | 0.0293691 | 0.0297479 | 0.0296424 | 0.0282460 | 0.0303569 | 0.0288153 | 0.0189424 | 0.0340537 | 0.0198847 | 0.0168270 |
| CF | 0.0267836 | 0.0224353 | 0.0214392 | 0.0348293 | 0.0237590 | 0.0000000 | 0.0294367 | 0.0231731 | 0.0279445 | 0.0281877 | 0.0232136 | 0.0245153 | 0.0219338 | 0.0340184 | 0.0315772 | 0.0322944 | 0.0277815 | 0.0364529 | 0.0231211 | 0.0258244 | 0.0272402 | 0.0251380 | 0.0218596 | 0.0236070 | 0.0300860 | 0.0206356 | 0.0260934 | 0.0295962 | 0.0262150 | 0.0268463 | 0.0277910 | 0.0209021 |
| CL | 0.0246469 | 0.0253128 | 0.0248414 | 0.0410418 | 0.0374692 | 0.0294367 | 0.0000000 | 0.0251011 | 0.0299262 | 0.0331434 | 0.0231170 | 0.0231664 | 0.0216017 | 0.0409459 | 0.0380001 | 0.0359048 | 0.0166704 | 0.0425012 | 0.0262426 | 0.0236694 | 0.0238168 | 0.0231343 | 0.0217015 | 0.0238932 | 0.0375295 | 0.0237068 | 0.0242312 | 0.0350436 | 0.0397157 | 0.0242384 | 0.0427709 | 0.0276300 |
| CN | 0.0189064 | 0.0183065 | 0.0180955 | 0.0333634 | 0.0258122 | 0.0231731 | 0.0251011 | 0.0000000 | 0.0229948 | 0.0253191 | 0.0149444 | 0.0173068 | 0.0171130 | 0.0341757 | 0.0295207 | 0.0233035 | 0.0221045 | 0.0332522 | 0.0130664 | 0.0180485 | 0.0172908 | 0.0177681 | 0.0162922 | 0.0178054 | 0.0305035 | 0.0193922 | 0.0175124 | 0.0271138 | 0.0276345 | 0.0174463 | 0.0301545 | 0.0171157 |
| EC | 0.0190003 | 0.0245430 | 0.0231898 | 0.0331291 | 0.0321994 | 0.0279445 | 0.0299262 | 0.0229948 | 0.0000000 | 0.0263762 | 0.0226038 | 0.0214934 | 0.0211447 | 0.0335061 | 0.0287778 | 0.0328450 | 0.0274776 | 0.0324182 | 0.0233297 | 0.0234037 | 0.0230408 | 0.0231056 | 0.0212814 | 0.0224650 | 0.0305677 | 0.0242452 | 0.0236657 | 0.0273561 | 0.0339258 | 0.0228591 | 0.0364258 | 0.0244998 |
| GA | 0.0266201 | 0.0258348 | 0.0247534 | 0.0158469 | 0.0289741 | 0.0281877 | 0.0331434 | 0.0253191 | 0.0263762 | 0.0000000 | 0.0251998 | 0.0256548 | 0.0263371 | 0.0236291 | 0.0221377 | 0.0338229 | 0.0312204 | 0.0257589 | 0.0239398 | 0.0283194 | 0.0280406 | 0.0278849 | 0.0253777 | 0.0224607 | 0.0151134 | 0.0270017 | 0.0268538 | 0.0184739 | 0.0294554 | 0.0275287 | 0.0315750 | 0.0235851 |
| GB | 0.0175230 | 0.0176178 | 0.0179690 | 0.0328156 | 0.0265930 | 0.0232136 | 0.0231170 | 0.0149444 | 0.0226038 | 0.0251998 | 0.0000000 | 0.0167527 | 0.0163814 | 0.0337995 | 0.0289854 | 0.0245593 | 0.0206033 | 0.0324782 | 0.0145730 | 0.0166393 | 0.0159460 | 0.0162265 | 0.0153821 | 0.0171601 | 0.0304754 | 0.0192077 | 0.0164729 | 0.0266545 | 0.0282273 | 0.0161405 | 0.0305152 | 0.0173996 |
| GJ | 0.0164721 | 0.0197878 | 0.0187357 | 0.0339422 | 0.0314310 | 0.0245153 | 0.0231664 | 0.0173068 | 0.0214934 | 0.0256548 | 0.0167527 | 0.0000000 | 0.0151043 | 0.0353594 | 0.0304644 | 0.0271990 | 0.0212521 | 0.0339120 | 0.0176500 | 0.0166582 | 0.0158933 | 0.0165035 | 0.0137941 | 0.0161212 | 0.0314389 | 0.0182391 | 0.0166257 | 0.0276478 | 0.0327586 | 0.0149816 | 0.0354195 | 0.0214097 |
| JV | 0.0162117 | 0.0189161 | 0.0174954 | 0.0340365 | 0.0308357 | 0.0219338 | 0.0216017 | 0.0171130 | 0.0211447 | 0.0263371 | 0.0163814 | 0.0151043 | 0.0000000 | 0.0344323 | 0.0307617 | 0.0287256 | 0.0191716 | 0.0352049 | 0.0189470 | 0.0157196 | 0.0164529 | 0.0153160 | 0.0131920 | 0.0159127 | 0.0308224 | 0.0156749 | 0.0179585 | 0.0279383 | 0.0339841 | 0.0166767 | 0.0363474 | 0.0213632 |
| KA | 0.0355746 | 0.0335354 | 0.0321670 | 0.0252982 | 0.0325807 | 0.0340184 | 0.0409459 | 0.0341757 | 0.0335061 | 0.0236291 | 0.0337995 | 0.0353594 | 0.0344323 | 0.0000000 | 0.0190681 | 0.0414175 | 0.0389326 | 0.0235086 | 0.0329279 | 0.0374017 | 0.0383461 | 0.0366842 | 0.0342726 | 0.0317150 | 0.0193344 | 0.0337279 | 0.0370079 | 0.0175659 | 0.0345171 | 0.0383341 | 0.0358064 | 0.0298833 |
| KB | 0.0303433 | 0.0296384 | 0.0286313 | 0.0261380 | 0.0299775 | 0.0315772 | 0.0380001 | 0.0295207 | 0.0287778 | 0.0221377 | 0.0289854 | 0.0304644 | 0.0307617 | 0.0190681 | 0.0000000 | 0.0368059 | 0.0359200 | 0.0117009 | 0.0277477 | 0.0323038 | 0.0326546 | 0.0324584 | 0.0301154 | 0.0282422 | 0.0217759 | 0.0314428 | 0.0311908 | 0.0167801 | 0.0306686 | 0.0325043 | 0.0325571 | 0.0264465 |
| KM | 0.0285037 | 0.0260247 | 0.0282892 | 0.0397149 | 0.0322526 | 0.0322944 | 0.0359048 | 0.0233035 | 0.0328450 | 0.0338229 | 0.0245593 | 0.0271990 | 0.0287256 | 0.0414175 | 0.0368059 | 0.0000000 | 0.0352685 | 0.0398091 | 0.0221475 | 0.0294939 | 0.0284646 | 0.0302520 | 0.0271601 | 0.0284409 | 0.0384818 | 0.0309566 | 0.0264761 | 0.0357467 | 0.0334697 | 0.0279264 | 0.0353186 | 0.0258690 |
| LC | 0.0221734 | 0.0236629 | 0.0223153 | 0.0390792 | 0.0352532 | 0.0277815 | 0.0166704 | 0.0221045 | 0.0274776 | 0.0312204 | 0.0206033 | 0.0212521 | 0.0191716 | 0.0389326 | 0.0359200 | 0.0352685 | 0.0000000 | 0.0406755 | 0.0236781 | 0.0207346 | 0.0214815 | 0.0193848 | 0.0195040 | 0.0214650 | 0.0355828 | 0.0210133 | 0.0221636 | 0.0324812 | 0.0372541 | 0.0218466 | 0.0410061 | 0.0251646 |
| LL | 0.0338473 | 0.0337474 | 0.0331018 | 0.0295467 | 0.0345354 | 0.0364529 | 0.0425012 | 0.0332522 | 0.0324182 | 0.0257589 | 0.0324782 | 0.0339120 | 0.0352049 | 0.0235086 | 0.0117009 | 0.0398091 | 0.0406755 | 0.0000000 | 0.0309655 | 0.0359579 | 0.0358022 | 0.0360280 | 0.0341083 | 0.0318546 | 0.0261726 | 0.0363706 | 0.0344018 | 0.0211644 | 0.0340164 | 0.0355534 | 0.0361843 | 0.0305224 |
| LM | 0.0188846 | 0.0174792 | 0.0182653 | 0.0321425 | 0.0228768 | 0.0231211 | 0.0262426 | 0.0130664 | 0.0233297 | 0.0239398 | 0.0145730 | 0.0176500 | 0.0189470 | 0.0329279 | 0.0277477 | 0.0221475 | 0.0236781 | 0.0309655 | 0.0000000 | 0.0188936 | 0.0179248 | 0.0188001 | 0.0172917 | 0.0183495 | 0.0295287 | 0.0207955 | 0.0168323 | 0.0256663 | 0.0238840 | 0.0177935 | 0.0264541 | 0.0149584 |
| MM | 0.0178398 | 0.0202507 | 0.0198932 | 0.0365994 | 0.0331576 | 0.0258244 | 0.0236694 | 0.0180485 | 0.0234037 | 0.0283194 | 0.0166393 | 0.0166582 | 0.0157196 | 0.0374017 | 0.0323038 | 0.0294939 | 0.0207346 | 0.0359579 | 0.0188936 | 0.0000000 | 0.0125471 | 0.0124966 | 0.0154447 | 0.0177282 | 0.0337550 | 0.0192789 | 0.0166609 | 0.0293564 | 0.0347920 | 0.0166414 | 0.0375904 | 0.0223864 |
| MT | 0.0170678 | 0.0204187 | 0.0202839 | 0.0370556 | 0.0342563 | 0.0272402 | 0.0238168 | 0.0172908 | 0.0230408 | 0.0280406 | 0.0159460 | 0.0158933 | 0.0164529 | 0.0383461 | 0.0326546 | 0.0284646 | 0.0214815 | 0.0358022 | 0.0179248 | 0.0125471 | 0.0000000 | 0.0125909 | 0.0156450 | 0.0177622 | 0.0349554 | 0.0205679 | 0.0152600 | 0.0297690 | 0.0350223 | 0.0144608 | 0.0381636 | 0.0227158 |
| QG | 0.0179127 | 0.0200585 | 0.0192087 | 0.0365161 | 0.0327005 | 0.0251380 | 0.0231343 | 0.0177681 | 0.0231056 | 0.0278849 | 0.0162265 | 0.0165035 | 0.0153160 | 0.0366842 | 0.0324584 | 0.0302520 | 0.0193848 | 0.0360280 | 0.0188001 | 0.0124966 | 0.0125909 | 0.0000000 | 0.0150762 | 0.0174641 | 0.0333414 | 0.0184967 | 0.0165357 | 0.0293283 | 0.0343993 | 0.0161502 | 0.0377306 | 0.0215739 |
| RC | 0.0164681 | 0.0185294 | 0.0166406 | 0.0333541 | 0.0293691 | 0.0218596 | 0.0217015 | 0.0162922 | 0.0212814 | 0.0253777 | 0.0153821 | 0.0137941 | 0.0131920 | 0.0342726 | 0.0301154 | 0.0271601 | 0.0195040 | 0.0341083 | 0.0172917 | 0.0154447 | 0.0156450 | 0.0150762 | 0.0000000 | 0.0138413 | 0.0306514 | 0.0152993 | 0.0169273 | 0.0274372 | 0.0318366 | 0.0144455 | 0.0345417 | 0.0203048 |
| RL | 0.0181886 | 0.0197295 | 0.0183536 | 0.0299791 | 0.0297479 | 0.0236070 | 0.0238932 | 0.0178054 | 0.0224650 | 0.0224607 | 0.0171601 | 0.0161212 | 0.0159127 | 0.0317150 | 0.0282422 | 0.0284409 | 0.0214650 | 0.0318546 | 0.0183495 | 0.0177282 | 0.0177622 | 0.0174641 | 0.0138413 | 0.0000000 | 0.0275385 | 0.0181671 | 0.0185550 | 0.0253345 | 0.0313771 | 0.0167345 | 0.0345433 | 0.0207855 |
| RP | 0.0323123 | 0.0297079 | 0.0289344 | 0.0154639 | 0.0296424 | 0.0300860 | 0.0375295 | 0.0305035 | 0.0305677 | 0.0151134 | 0.0304754 | 0.0314389 | 0.0308224 | 0.0193344 | 0.0217759 | 0.0384818 | 0.0355828 | 0.0261726 | 0.0295287 | 0.0337550 | 0.0349554 | 0.0333414 | 0.0306514 | 0.0275385 | 0.0000000 | 0.0302712 | 0.0332392 | 0.0177010 | 0.0313996 | 0.0345974 | 0.0331327 | 0.0267652 |
| SC | 0.0207600 | 0.0201100 | 0.0155876 | 0.0342866 | 0.0282460 | 0.0206356 | 0.0237068 | 0.0193922 | 0.0242452 | 0.0270017 | 0.0192077 | 0.0182391 | 0.0156749 | 0.0337279 | 0.0314428 | 0.0309566 | 0.0210133 | 0.0363706 | 0.0207955 | 0.0192789 | 0.0205679 | 0.0184967 | 0.0152993 | 0.0181671 | 0.0302712 | 0.0000000 | 0.0214085 | 0.0286619 | 0.0325139 | 0.0200717 | 0.0347512 | 0.0207936 |
| SD | 0.0178579 | 0.0185614 | 0.0196366 | 0.0354557 | 0.0303569 | 0.0260934 | 0.0242312 | 0.0175124 | 0.0236657 | 0.0268538 | 0.0164729 | 0.0166257 | 0.0179585 | 0.0370079 | 0.0311908 | 0.0264761 | 0.0221636 | 0.0344018 | 0.0168323 | 0.0166609 | 0.0152600 | 0.0165357 | 0.0169273 | 0.0185550 | 0.0332392 | 0.0214085 | 0.0000000 | 0.0288431 | 0.0303812 | 0.0160337 | 0.0328389 | 0.0200819 |
| SL | 0.0283760 | 0.0272561 | 0.0263040 | 0.0220385 | 0.0288153 | 0.0295962 | 0.0350436 | 0.0271138 | 0.0273561 | 0.0184739 | 0.0266545 | 0.0276478 | 0.0279383 | 0.0175659 | 0.0167801 | 0.0357467 | 0.0324812 | 0.0211644 | 0.0256663 | 0.0293564 | 0.0297690 | 0.0293283 | 0.0274372 | 0.0253345 | 0.0177010 | 0.0286619 | 0.0288431 | 0.0000000 | 0.0293650 | 0.0301524 | 0.0314807 | 0.0244301 |
| SP | 0.0337146 | 0.0262603 | 0.0267735 | 0.0350017 | 0.0189424 | 0.0262150 | 0.0397157 | 0.0276345 | 0.0339258 | 0.0294554 | 0.0282273 | 0.0327586 | 0.0339841 | 0.0345171 | 0.0306686 | 0.0334697 | 0.0372541 | 0.0340164 | 0.0238840 | 0.0347920 | 0.0350223 | 0.0343993 | 0.0318366 | 0.0313771 | 0.0313996 | 0.0325139 | 0.0303812 | 0.0293650 | 0.0000000 | 0.0344131 | 0.0174452 | 0.0192507 |
| SY | 0.0169339 | 0.0216470 | 0.0195137 | 0.0367025 | 0.0340537 | 0.0268463 | 0.0242384 | 0.0174463 | 0.0228591 | 0.0275287 | 0.0161405 | 0.0149816 | 0.0166767 | 0.0383341 | 0.0325043 | 0.0279264 | 0.0218466 | 0.0355534 | 0.0177935 | 0.0166414 | 0.0144608 | 0.0161502 | 0.0144455 | 0.0167345 | 0.0345974 | 0.0200717 | 0.0160337 | 0.0301524 | 0.0344131 | 0.0000000 | 0.0375182 | 0.0229745 |
| TM | 0.0363946 | 0.0280640 | 0.0290556 | 0.0365534 | 0.0198847 | 0.0277910 | 0.0427709 | 0.0301545 | 0.0364258 | 0.0315750 | 0.0305152 | 0.0354195 | 0.0363474 | 0.0358064 | 0.0325571 | 0.0353186 | 0.0410061 | 0.0361843 | 0.0264541 | 0.0375904 | 0.0381636 | 0.0377306 | 0.0345417 | 0.0345433 | 0.0331327 | 0.0347512 | 0.0328389 | 0.0314807 | 0.0174452 | 0.0375182 | 0.0000000 | 0.0210962 |
| WC | 0.0228502 | 0.0152160 | 0.0178810 | 0.0304946 | 0.0168270 | 0.0209021 | 0.0276300 | 0.0171157 | 0.0244998 | 0.0235851 | 0.0173996 | 0.0214097 | 0.0213632 | 0.0298833 | 0.0264465 | 0.0258690 | 0.0251646 | 0.0305224 | 0.0149584 | 0.0223864 | 0.0227158 | 0.0215739 | 0.0203048 | 0.0207855 | 0.0267652 | 0.0207936 | 0.0200819 | 0.0244301 | 0.0192507 | 0.0229745 | 0.0210962 | 0.0000000 |
.inline-btns p {
display: inline;
}
library(downloadthis)
Fst_results <- read.csv("files/Fst.csv")
d_btn <- Fst_results %>%
download_this(
path = system.file("/home/caro/Escritorio/BASA_2/basa_Rmd/", package = "downloadthis"),
output_name = "Genetic distances",
output_extension = ".csv",
button_label = "Genetic distances",
button_type = "default",
has_icon = TRUE,
icon = "fa fa-save"
)
| X | Sigma | X. |
|---|---|---|
| Variations Between samples | 278.0909 | 22.07262 |
| Variations Within samples | 981.7995 | 77.92738 |
| Total variations | 1259.8903 | 100.00000 |
#devtools::install_github("dkahle/ggmap")
library(ggmap)
library(devtools)
require(ggplot2)
require(ggsci)
require(ggrepel)
library(data.table)
library(patchwork)
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/Map/")
basacoord <- read.csv('coordinates.csv', header=T)
names(basacoord) <- c('Pop','Lat','Long')
map <- get_stamenmap(bbox = c(left = -120.5, bottom = 37.00, right = -109.0, top = 49.00),
zoom=8,scale = 3,maptype = 'terrain-background')
##zoom controls resolution, 10 takes forever
ggmap(map) + geom_point(data = basacoord, aes(x = Long, y = Lat),size=3,pch=21,fill="white",col="black") +
xlab("Longitude") + ylab("Latitude") +
coord_map(xlim=c(-120.5,-109.0),ylim=c(37.00,49.00)) #selects the range for x and y
group.colors <- c(AN = "#F0F8FF", AS ="#E9967A", BH ="#FFEBCD", BB = "#F08080",CF = "#8B0000",CC ="#FF3030",CN ="#CD6090",CL = "#8B3A62", EC = "#FF82AB",
GA ="#FFC0CB",GB ="#FFE1FF", GJ="#CDB5CD",JV ="#8B7B8B", KA ="#9B30FF",KB="#8470FF",KM ="#27408B",LM ="#87CEFA",
LL="#104E8B",LC="#8DB6CD",MM="#6E7B8B",MT="#FFB90F",QG="#EE7600",RC="#FFF68F",RL="#8B7500",
SL="#2F4F4F",SD="#008B8B",SP="#006400",SC="#BDB76B", SY="#ADFF2F", TM="#00CD66",RP="#00CDCD", WC="#BBFFFF")
map <- ggmap(map) + geom_point(data = basacoord, aes(x = Long, y = Lat),size=2,col="black",fill='white',pch=21) +
geom_label_repel(data = basacoord, aes(x = Long, y = Lat,label = Pop,fill=Pop),
colour = "black", fontface = "bold") +
scale_fill_manual(values=group.colors) +
coord_map(xlim=c(-120.5,-109.0),ylim=c(37.00,49.00)) +
xlab("Longitude") + ylab("Latitude") + theme_bw() +
theme(legend.position = 'none',
axis.text = element_text(size=13),
axis.title = element_text(size = 15, colour="black",face = "bold", vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
map
ggsave('map_basa.pdf',height=8,width=5)
PCA:
library(data.table)
library(ggplot2)
library(ggsci)
library(umap)
library(LEA)
library(readr)
library(ggpubr)
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/PCA_UMAP/")
##### PCA #####
PCA_gen <- function(df_gen,indv,num=5){ #add ggplot, add tw, add # of output
#pkgTest("ggplot2")
df_gen <- apply(df_gen, 2, function(df) gsub(-1,NA,df,fixed=TRUE))
df_gen <- apply(df_gen, 2, function(df) as.numeric(df))
colmean<-apply(df_gen,2,mean,na.rm=TRUE)
normalize<-matrix(nrow = nrow(df_gen),ncol=ncol(df_gen))
af<-colmean/2
for (m in 1:length(af)){
nr<-df_gen[ ,m]-colmean[m]
dn<-sqrt(af[m]*(1-af[m]))
normalize[ ,m]<-nr/dn
}
normalize[is.na(normalize)]<-0
method1<-prcomp(normalize, scale. = FALSE,center = FALSE)
pve <- summary(method1)$importance[2,]
print(pve[1:50])
if(nrow(df_gen) < num){
num <- nrow(df_gen)
}
pca_X<- method1$x[,1:num]
pca_X <- as.data.frame(pca_X)
pca_X$Pop <- indv$Pop
pca_X$ID <- indv$ID
pca_out <- list("pca_df"=pca_X,"pve"=pve)
#print(PCA_fig(pca_out))
return(pca_out)
}
######################## PCA_fig() ###################
PCA_fig <- function(pca_out,fill="Pop"){ #add PCA setting or normal
xlab <- paste("PCA1 (",round(pca_out$pve[1]*100,2),"%)",sep="")
ylab <- paste("PCA2 (",round(pca_out$pve[2]*100,2),"%)",sep="")
pca_df <- pca_out$pca_df
filler <- as.character(pca_df[[fill]])
ggplot(data = pca_df, aes(x=PC1,y=PC2,fill=filler)) +
geom_point(pch=21,colour='black',size = 3)+
xlab(xlab) + ylab(ylab) +
theme_bw() +
theme(#legend.position = 'none',
axis.text = element_text(size=13),
axis.title = element_text(size = 16, colour="black",face = "bold",vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
legend.text = element_text(size=13),
legend.title = element_text(size = 16, colour="black",face = "bold",vjust = 1),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
}
######
df012<-fread("basa.012",sep="\t", data.table=F)
df012 <- df012[,-1]
Pop_ID_Sum <- read.csv('Pop_ID_sin.csv')
#############
pca_out <- PCA_gen(df012,Pop_ID_Sum)
pve <- pca_out$pve[1:50]
pve
pca_df <- pca_out$pca_df[,1:50]
pca_df <- cbind(Pop_ID_Sum,pca_df)
write.csv(pca_df,'pca_df_.csv',row.names = FALSE)
##############
pca_df <- read.csv("pca_df.csv")
pve <- c(0.06575, 0.03794, 0.02006, 0.01616, 0.01205)
group.colors <- c(AN = "#F0F8FF", AS ="#E9967A", BH ="#FFEBCD", BB = "#F08080",CF = "#8B0000",CC ="#FF3030",CN ="#CD6090",CL = "#8B3A62", EC = "#FF82AB", GA
="#FFC0CB",GB ="#FFE1FF", GJ="#CDB5CD",JV ="#8B7B8B", KA ="#9B30FF",KB="#8470FF",KM ="#27408B",LM ="#87CEFA",
LL="#104E8B",LC="#8DB6CD",MM="#6E7B8B",MT="#FFB90F",QG="#EE7600",RC="#FFF68F",RL="#8B7500",
SL="#2F4F4F",SD="#008B8B",SP="#006400",SC="#BDB76B", SY="#ADFF2F", TM="#00CD66",RP="#00CDCD", WC="#BBFFFF"#,B.hookeri="#ffff00ff",
#BAH03xB.hybrid="#000000ba")
PCA1VS2 <- ggplot(data = pca_df, aes(x=PC1,y=PC2,fill=as.character(Pop), color=Pop)) +
geom_point(pch=21,colour='black',size = 5) +
xlab(paste("PC",1," (",pve[1]*100,"%)",sep="")) + ylab(paste("PC",2," (",pve[2]*100,"%)",sep=""))+
scale_fill_manual(values=group.colors)+ theme_bw() +
theme(legend.position = 'right',
axis.text = element_text(size=11),
axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggsave("pca1_2.pdf",PCA1VS2,height=8,width = 10,units = 'in')
UMAP:
#### UMAP
custom.settings = umap.defaults
custom.settings$min_dist = 0.9
custom.settings$n_neighbors = 14
umap_g <- as.data.frame(umap(df012,config = custom.settings)$layout )
names(umap_g) <- c('layout1','layout2')
umap_g <- cbind(Pop_ID_Sum,umap_g)
#### UMAP gprob ####
umap_g_plot <- ggplot(data = umap_g, aes(x=layout1,y=layout2,fill=as.character(Pop))) +
geom_point(colour='black',size = 5,pch=21) + #ggtitle("UMAP n_neighbors 14 min_dist 0.8") +
xlab('Layout 1') + ylab('Layout 2') +
scale_fill_manual(values=group.colors) +
theme_bw() +
theme(legend.position = 'right',
axis.text = element_text(size=11),
axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggsave("umap_09_14.pdf",umap_g_plot,height=8,width = 10,units = 'in')

entropy:
#perl /working/mascaro/acth/entropy2/vcf2mpglV1.3TLP.pl basa.vcf
#perl /working/mascaro/acth/entropy2/gl2genestV1.3.pl basa.mpgl mean
########### PCA_entropy
##row = loci, col = indv
require(readr)
require(MASS)
require(LEA)
require(ggplot2)
PCA_entropy <- function(g){
colmean<-apply(g,2,mean,na.rm=TRUE)
normalize<-matrix(nrow = nrow(g),ncol=ncol(g))
af<-colmean/2
for (m in 1:length(af)){
nr<-g[ ,m]-colmean[m]
dn<-sqrt(af[m]*(1-af[m]))
normalize[ ,m]<-nr/dn
}
normalize[is.na(normalize)]<-0
method1<-prcomp(normalize, scale. = FALSE,center = FALSE)
pve <- summary(method1)$importance[2,]
print(pve[1:5])
pca_df<- method1$x[,1:27]
return(pca_df)
}
require(readr)
require(MASS)
require(LEA)
require(ggplot2)
g <- read.table("pntest_mean_variants_maf5_miss9_thin100_noBadInds.recode.txt", header=F)
Pop_ID_Sum <- read.csv("Pop_ID.csv")
pca_df <- PCA_entropy(t(g))
#Header for entropy:
Pop_ID <- read.csv("Pop_ID.csv")
Sp_Pop <- paste("AT",Pop_ID$Pop,sep="_")
Pop_ID <- paste(Pop_ID$Pop,Pop_ID$ID,sep="_")
Header <- data.frame(Sp_Pop,Pop_ID)
write.table(t(Header),'entropy_header.txt',sep = " ", quote = FALSE,row.names =FALSE)
dim(g)
#[1] loci_individuals
Pop_ID_Sum <- read.csv("Pop_ID_Sum.csv")
pca_df <- PCA_entropy(t(g)) ######change PCA_entropy to allow for selecting PC# output and tw
test
library(MASS)
k2<-kmeans(pca_df[,1:5],2,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k3<-kmeans(pca_df[,1:5],3,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k4<-kmeans(pca_df[,1:5],4,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k5<-kmeans(pca_df[,1:5],5,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k6<-kmeans(pca_df[,1:5],6,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k7<-kmeans(pca_df[,1:5],7,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k8<-kmeans(pca_df[,1:5],8,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k9<-kmeans(pca_df[,1:5],9,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
k10<-kmeans(pca_df[,1:5],10,iter.max=10,nstart=10,algorithm="Hartigan-Wong")
ldak2<-lda(x=pca_df[,1:5],grouping=k2$cluster,CV=TRUE)
ldak3<-lda(x=pca_df[,1:5],grouping=k3$cluster,CV=TRUE)
ldak4<-lda(x=pca_df[,1:5],grouping=k4$cluster,CV=TRUE)
ldak5<-lda(x=pca_df[,1:5],grouping=k5$cluster,CV=TRUE)
ldak6<-lda(x=pca_df[,1:5],grouping=k6$cluster,CV=TRUE)
ldak7<-lda(x=pca_df[,1:5],grouping=k7$cluster,CV=TRUE)
ldak8<-lda(x=pca_df[,1:5],grouping=k8$cluster,CV=TRUE)
ldak9<-lda(x=pca_df[,1:5],grouping=k9$cluster,CV=TRUE)
ldak10<-lda(x=pca_df[,1:5],grouping=k10$cluster,CV=TRUE)
write.table(round(ldak2$posterior,5),file="ldak2.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak3$posterior,5),file="ldak3.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak4$posterior,5),file="ldak4.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak5$posterior,5),file="ldak5.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak6$posterior,5),file="ldak6.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak7$posterior,5),file="ldak7.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak8$posterior,5),file="ldak8.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak9$posterior,5),file="ldak9.txt",quote=F,row.names=F,col.names=F)
write.table(round(ldak10$posterior,5),file="ldak10.txt",quote=F,row.names=F,col.names=F)
#cat entropy_header.txt basa.mpgl > good_snps.entropy.mpgl
## add 584 36161 1 to the top of entropy_header.txt y dejas esa linea vacia! mirar el ejemplo
mio#########
entropy -i basa_entropy.mpgl -o basa_k2.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 2 -q
ldak2.txt -m 1 -w 0 &> k2stdout.txt
entropy -i basa_entropy.mpgl -o basa_k3.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 3 -q
ldak2.txt -m 1 -w 0 &> k3stdout.txt
entropy -i basa_entropy.mpgl -o basa_k4.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 4 -q
ldak4.txt -m 1 -w 0 &> k4stdout.txt
entropy -i basa_entropy.mpgl -o basa_k5.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 5 -q
ldak5.txt -m 1 -w 0 &> k5stdout.txt
entropy -i basa_entropy.mpgl -o basa_k6.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 6 -q
ldak6.txt -m 1 -w 0 &> k6stdout.txt
entropy -i basa_entropy.mpgl -o basa_k7.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 7 -q
ldak7.txt -m 1 -w 0 &> k7stdout.txt
entropy -i basa_entropy.mpgl -o basa_k8.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 8 -q
ldak8.txt -m 1 -w 0 &> k8stdout.txt
entropy -i basa_entropy.mpgl -o basa_k9.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 9 -q
ldak9.txt -m 1 -w 0 &> k9stdout.txt
entropy -i basa_entropy.mpgl -o basa_k10.hdf5 -l 70000 -b 30000 -t 10 -s 20 -e .01 -k 10 -q
ldak10.txt -m 1 -w 0 &>
k10stdout.txt
### DIC value
module load entropy/1.2
###Get the DICs values for each K value:
estpost.entropy basa_k2.hdf5 -s 3 -p deviance > DIC_k2.txt
estpost.entropy basa_k3.hdf5 -s 3 -p deviance > DIC_k3.txt
estpost.entropy basa_k4.hdf5 -s 3 -p deviance > DIC_k4.txt
estpost.entropy basa_k5.hdf5 -s 3 -p deviance > DIC_k5.txt
estpost.entropy basa_k6.hdf5 -s 3 -p deviance > DIC_k6.txt
estpost.entropy basa_k7.hdf5 -s 3 -p deviance > DIC_k7.txt
estpost.entropy basa_k8.hdf5 -s 3 -p deviance > DIC_k8.txt
estpost.entropy basa_k9.hdf5 -s 3 -p deviance > DIC_k9.txt
estpost.entropy basa_k10.hdf5 -s 3 -p deviance > DIC_k10.txt
###Generate entropy q files:
estpost.entropy basa_k2.hdf5 -p q -s 0 -o q2.txt
estpost.entropy basa_k3.hdf5 -p q -s 0 -o q3.txt
estpost.entropy basa_k4.hdf5 -p q -s 0 -o q4.txt
estpost.entropy basa_k5.hdf5 -p q -s 0 -o q5.txt
estpost.entropy basa_k6.hdf5 -p q -s 0 -o q6.txt
estpost.entropy basa_k7.hdf5 -p q -s 0 -o q7.txt
estpost.entropy basa_k8.hdf5 -p q -s 0 -o q8.txt
estpost.entropy basa_k9.hdf5 -p q -s 0 -o q9.txt
estpost.entropy basa_k10.hdf5 -p q -s 0 -o q10.txt
###Generate gprobs file:
estpost.entropy basa_k2.hdf5 -p gprob -s 0 -o gprob.txt
# plotting
library(ggplot2)
library(forcats)
library(ggthemes)
library(patchwork)
kdf2 <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/entropy/definitivo/q2.csv"
)
k2plot <-
ggplot(kdf2, aes(factor(sampleID), prob, fill = factor(popGroup))) +
geom_col(color = "gray", size = 0.1) +
facet_grid(~fct_inorder(loc), switch = "x", scales = "free", space = "free") +
theme_minimal() + labs(x = "Populations", title = "K=2", y = "Ancestry") +
scale_y_continuous(expand = c(0, 0)) +
scale_x_discrete(expand = expansion(add = 1)) +
theme(
panel.spacing.x = unit(0.1, "lines"),
axis.text.x = element_blank(),
panel.grid = element_blank()
) +
scale_fill_gdocs(guide = FALSE)
k2plot
ggsave("k2.pdf",k3plot,height=8,width = 20,units = 'in')
Admixture piecharts:
## Admixture pie chart
BiocManager::install("LEA")
# Load packages
library(adegenet)
library(poppr)
library(LEA)
library(reshape2)
library(dplyr)
library(ggplot2)
library(rworldmap)
library(rworldxtra)
library(ggsn)
library(sf)
library(raster)
library(rgeos)
library(maps)
library(maptools)
library(grid)
library(miscTools)
library(stringr)
library(ggpubr)
qmatrix<-read.csv("Desktop/q4.csv",header=TRUE, sep = ";")
qmatrix
# Label column names of qmatrix
ncol(qmatrix)
head(qmatrix)
qlong = melt(qmatrix, id.vars=c("ID","Pop"))
head(qlong)
pal = colorRampPalette(c("red","blue", "green", "light blue"))
cols = pal(length(unique(qlong$variable)))
admix.bar = ggplot(data=qlong, aes(x=ID, y=value, fill=variable))+
geom_bar(stat = "identity")+
scale_y_continuous(expand = c(0,0))+
facet_wrap(~Pop, scales = "free", ncol = 2)+
scale_fill_manual(values = cols)+
ylab("Admixture proportion")+
# xlab("Individual")+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank(),
strip.text = element_text(colour="black", size=12),
panel.grid = element_blank(),
panel.background = element_blank(),
legend.position = "top",
legend.title = element_blank(),
legend.text = element_text(size = 12))
admix.bar
ggsave("Desktop/1.admixture_barplot.png", width=6, height=10, dpi=300)
# Calculate mean admixture proportions for each site
head(qmatrix)
clusters = grep("K", names(qmatrix)) # indexes of cluster columns
avg_admix = aggregate(qmatrix[, clusters], list(qmatrix$Pop), mean)
# Order alphabetically by site
avg_admix = avg_admix[order(as.character(avg_admix$Group.1)), ]
avg_admix
# Convert dataframe from wide to long format
avg_admix = melt(avg_admix, id.vars = "Group.1")
head(avg_admix)
# Define a function to plot pie charts using ggplot for each site
pie_charts = function(admix_df, site, cols){
# admix_df = dataframe in long format of admixture proportions per site
# site = string
# cols = vector of colours of length(clusters)
ggplot(data = subset(admix_df, Group.1 == site),
aes(x = "", y = value, fill = variable))+
geom_bar(width = 1, stat = "identity", colour = "black", show.legend = FALSE)+
coord_polar(theta = "y")+
scale_fill_manual(values = cols)+
theme_void()
}
# Test function on one site
pie_charts(avg_admix, site = "AN", cols = cols)
# Subset data to reduce computation time
subsites = sort(c("AN", "AS", "BB", "BH", "CC", "CF", "CL", "CN", "EC", "GA", "GB", "GJ",
"JV","KA", "KB", "KM", "LC", "LL", "LM", "MM", "MT", "QG", "RC", "RL", "RP",
"SC", "SD", "SL",
"SP", "SY", "TM", "WC"))
# Apply function to all sites using for loop
pies = list()
for (i in subsites){
pies[[i]] = pie_charts(admix_df = avg_admix, site = i, cols = cols)
}
# Prepare basemap
# Import csv file containing coordinates
coords = read.csv("Desktop/coordinates.csv", sep = ";")
# Subset coordinates
coords = coords[coords$Pop %in% subsites, ]
# Order alphabetically by site
coords = coords[order(coords$Pop), ]
coords
# Check order matches coords order
as.character(avg_admix$Group.1) == as.character(coords$Pop)
# Set map boundary (xmin, xmax, ymin, ymax)
boundary = extent(-120.5, -109, 37, 49)
boundary
# Get map outlines from rworldmap package
map.outline = getMap(resolution = "high")
# Crop to boundary and convert to dataframe
map.outline = crop(map.outline, y = boundary) %>% fortify()
# Plot basemap
basemap = ggplot()+
geom_polygon(data=map.outline, aes(x=long, y=lat, group=group), fill="grey",
colour="black", size=0.5)+
coord_quickmap(expand=F)+
#lggsn::north(map.outline, symbol = 10, scale = 0.06, location = "topleft")+
#lggsn::scalebar(data = map.outline, dist = 50, dist_unit = "km", height = 0.01,
#ltransform = TRUE, model = "WGS84" )+
#location = "bottomleft", anchor = c(x = -114, y = 38),
#st.bottom = FALSE, st.size = 4, st.dist = 0.015)+
xlab("Longitude")+
ylab("Latitude")+
theme(
axis.text = element_text(colour="black", size=12),
axis.title = element_text(colour="black", size=14),
panel.background = element_rect(fill="lightsteelblue2"),
panel.border = element_rect(fill = NA, colour = "black", size = 0.5),
panel.grid.minor = element_line(colour="grey90", size=0.5),
panel.grid.major = element_line(colour="grey90", size=0.5),
legend.text = element_text(size=12),
legend.title = element_blank(),
legend.key.size = unit(0.7, "cm"),
legend.position = "top"
)
basemap
coord.list = list()
for (i in subsites){
coord.list[[i]] = c(subset(coords, Pop == i)$Lon, subset(coords, Pop == i)$Lat)
}
coord.list
# Define pie chart sizes
radius = 0.45
# Convert ggplot pie charts to annotation_custom layers
pies.ac = list()
for (i in 1:length(subsites)){
pies.ac[[i]] = annotation_custom(grob = ggplotGrob(pies[[i]]),
xmin = coord.list[[i]][[1]] - radius,
xmax = coord.list[[i]][[1]] + radius,
ymin = coord.list[[i]][[2]] - radius,
ymax = coord.list[[i]][[2]] + radius)
}
# Add layers to basemap
pie.map = basemap + pies.ac
pie.map
ggsave("Desktop/4.pie_charts_map.png", width = 8, height = 10, dpi = 300)
# Combine ggplots
ggarrange(admix.bar + theme(legend.position = "right") + labs(title = "Individual admixture proportions", tag = "A"),
pie.map + labs(title = "Mean admixture proportions for each site", tag = "B"))
ggsave("Desktop/4.Admixture_bar_map.png", width = 30, height = 10, dpi = 300)
EEMS, unPC:
#EEMS
diffs <- read.table("./test/example-SNP-major-mode.diffs")
diffs
datapath = ./data/inputdata
mcmcpath = ./data/outputdata
nIndiv = 246
nSites = 5677
nDemes = 300
diploid = false
numMCMCIter = 200000000
numBurnIter = 100000000
numThinIter = 999999
./runeems_snps --params configurationfile.ini --seed 123 (randome seed, optional)
./runeems_snps --params configurationfile2.ini --seed 123
./runeems_snps --params configurationfile3.ini --seed 123
EEMS results visualization:
library(rEEMSplots)
mcmcpath <- c("./eems_output/output_1", "./eems_output/output_2", "./eems_output/output_3")
plotpath = (""./eems_output/plots"")
#unPC
library(unPC)
unPC(inputToProcess = "pca_eigen.evec", outputPrefix = "unPC",
runDataImport = TRUE, runPairwiseCalc = TRUE, geogrCoords= "Lon_Lat.txt",
roundEarth = TRUE, firstPC = 1, secondPC = 2, runPlotting = TRUE,
applyManualColor = FALSE, geogrCoordsForPlotting = "Lon_Lat_popnAggregated.txt",
plotWithMap = FALSE,
ellipseWidth = 0.05, populationPointNormalization = 7,
runAggregated = TRUE, savePlotsToPdf = TRUE)
unpcout <- readRDS("unPC_pairwiseDistCalc_unPC.rds")
unPC <- unpcout$ratioPCToGeogrDist
write.csv(unpcout, "unPCscores.csv")
pRDA:
############## RDA
library(adegenet)
library(poppr)
library(tidyverse)
library(vcfR)
library(hierfstat)
library(pegas)
library(magrittr)
library(ape)
library(psych)
library(adespatial)
library(vegan)
## Loading the genetic data
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/")
##setwd("/home/caro/Escritorio/")
gen <- read.csv("basa_prda.012", row.names=1)
##gen <- read.csv("acth_012.csv", row.names=1)
dim(gen)
sum(is.na(gen))
gen.imp <- apply(gen, 2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x))))))
## Env. variables
Env <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Env.csv", header = TRUE)
Coordinates <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Lon_Lat.csv")
PopStruct <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/pca_df_.csv")
## Table gathering all variables
Variables <- data.frame(Env, Coordinates, PopStruct)
Variables[1:5,]
## Null model
RDA0 <- rda(gen.imp ~ 1, Variables)
## Full model
RDAfull <- rda(gen.imp ~ ppt+tdmean+tmax+tmean+tmin+vpdmax+vpdmin+slope+Af+soilawc+sloprad+afrad+heatload+CumlWs+DurCWD+OnsetCWD+CumlAET+CumlCWD+CumlPET+WsAE
Tspr+FallAET+MaxCWD+pcseas+CumlGDD+AETgdd+DurGDD+mxtmpfal+mxtmpspr+mxtmpsum+mxtmpwin+mntmpfal+ mntmpspr+
mntmpsu+mntmpwin+prcpfal+prcpspr+prcpsum+prcpwin+Elevation+ mndtmpfal+mndtmpspr+mndtmpsu+mndtmpwin+Lon+Lat+PC1+PC2, Variables)
#Quito esta: +DeclAET y quito los guiones bajos de ID
## "To conduct the selection procedure we used the ordiR2step function of the package vegan and the following stopping criteria: variable significance of p <
0.01 using 1000 permutations, and the adjusted R2 of the global model."
mod <- ordiR2step(RDA0, RDAfull, Pin = 0.01, R2permutations = 1000, R2scope = T, direction = T)
mod$anova
## Removing correlated predictors:
env <- subset(Variables, select=c(tmin,vpdmin,sloprad,prcpfal,mndtmpfal,tdmean,mxtmpfal,mntmpsu,mxtmpsum,mndtmpsu,mndtmpwin,AETgdd,CumlAET,soilawc,prcpwin,Cu
mlCWD,mntmpfal,MaxCWD,CumlGDD,tmax,tmean,pcseas,afrad,mntmpspr,OnsetCWD,mndtmpspr,CumlWs,mntmpwin,DurCWD))
str(env)
correlation <-cor(env, method = "pearson")
write.csv(correlation, "/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/correlation1.csv")
env2 <- subset(Variables,
select=c(sloprad,prcpfal,mndtmpfal,tdmean,mxtmpfal,mntmpsu,mxtmpsum,soilawc,prcp
win,CumlCWD,MaxCWD,CumlGDD,tmax,pcseas,afrad,OnsetCWD,CumlWs))
correlation2 <-cor(env2, method = "pearson")
write.csv(correlation2, "/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/correlation2.c
sv")
env3 <- subset(Variables,
select=c(sloprad,tdmean,mxtmpfal,mxtmpsum,soilawc,prcpwin,tmax,pcseas,afrad,Onse
tCWD,CumlWs))
pairs.panels(env3, scale = TRUE, density =TRUE, ellipses =TRUE, methods="pearson")
pdf("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Pairs_pannel3.pdf")
#######
####### Selected: sloprad,tdmean,mxtmpfal,mxtmpsum,soilawc,prcpwin,tmax,pcseas,afrad,OnsetCWD,CumlWs
## Full model
pRDAfull <- rda(gen.imp ~ PC1 + PC2+ + Lon + Lat+ sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs, Variables)
pRDAfull
#RsquareAdj(pRDAfull)
#anova(pRDAfull)
## Pure climate model
pRDAclim <- rda(gen.imp ~ sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs + Condition(PC1 + PC2+ Lon + Lat),Variables)
pRDAclim
#RsquareAdj(pRDAclim)
#anova(pRDAclim)
# Model summaries
#RsquareAdj(pRDAclim) # adjusted Rsquared
#The eigenvalues for the constrained axes reflect the variance explained by each canonical axis:
#summary(pRDAclim, display=NULL) #display = NULL optional
#summary(eigenvals(pRDAclim, model = "constrained"))
#screeplot(pRDAclim)
#check our RDA model for significance using formal tests. We can assess both the full model and each constrained axis using F-statistics (Legendre et al,
2010). The null hypothesis is that no linear relationship exists between the SNP data and the environmental predictors.
#anova.cca(pRDAclim, permutations = 100) # full model
#anova.cca(pRDAclim, permutations = 100, by="margin") # per variable
#signif.axis <- anova.cca(pRDAclim, by="axis", parallel=getOption("mc.cores"))
#signif.axis
#vif.cca(pRDAclim) # variance inflation factor (<10 OK)
# Model: rda(formula = gen.imp ~ AETgdd + AfRad + FallAET + MaxCWD + mxtmpwin + prcpsum + SlopRad + WsAETspr + Condition(PC1 + PC2 + Lon + Lat), data =
Variables)
# Df Variance F Pr(>F)
# AETgdd 1 11.505 15.742 0.009901 **
# AfRad 1 17.800 24.356 0.009901 **
# FallAET 1 14.477 19.808 0.009901 **
# MaxCWD 1 21.014 28.754 0.009901 **
# mxtmpwin 1 15.724 21.516 0.009901 **
# prcpsum 1 16.850 23.056 0.009901 **
# SlopRad 1 13.357 18.276 0.009901 **
# WsAETspr 1 15.852 21.691 0.009901 **
## Pure neutral population structure model
pRDAstruct <- rda(gen.imp ~ PC1 + PC2 + Condition(Lon + Lat + sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs),
Variables)
pRDAstruct
#RsquareAdj(pRDAstruct)
#anova(pRDAstruct)
##Pure geography model
pRDAgeog <- rda(gen.imp ~ Lon + Lat + Condition(sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs + PC1 + PC2), Variables)
pRDAgeog
#RsquareAdj(pRDAgeog)
#anova(pRDAgeog)
| Partial.RDA.models | Inertia | R2 | P…..F.. | Proportion.of.explainable.variance | Proportion.of.total.variance |
|---|---|---|---|---|---|
| Full model: gen ~ env + structur + geog | 30.45 | 0.09 | 0.001*** | 1.000 | 0.090 |
| Pure climate: gen ~ env (structur + geog) | 14.71 | 0.04 | 0.001*** | 0.483 | 0.043 |
| Pure structure: gen ~ structur (env + geog) | 6.04 | 0.02 | 0.001*** | 0.182 | 0.016 |
| Pure geography: gen ~ geog (env + structur) | 3.13 | 0.01 | 0.001*** | 0.115 | 0.010 |
| NA | NA | NA | NA | ||
| Counfounded climate/structure/geography | 6.57 | NA | 0.233 | 0.021 | |
| NA | NA | NA | NA | ||
| Total unexplained | 298.20 | NA | NA | NA | |
| Total inertia | 328.65 | NA | NA | NA |
Phylogeny IQtree:
Based on the phylogeny, B.
hookeri and B. hybrid are embedded within B. sagittata.
### vcf to nexus (binary) for PhyloNet
python vcf2phylip -i basa_outg.vcf -o LW -b
./iqtree -s basa.phy -nt 10 -m GTR+ASC -bb 1000
Fbranch:
Based on Fbranch results (very
conservative parameters) there is gene flow among some B. sagittata
populations and B. hookeri and B. hybrid. There is also some ancestral
gene flow among B. sagittata and basal clades of B. hookeri and B.
hybrid.
./Dsuite Dtrios basa.vcf SETS.txt -t tree_trimmed.tree
./Dsuite Fbranch -Z --Pb-matrix tree_trimmed.tree SETS_tree.txt > fbranch_collapsed_p.txt
head -n 79 fbranch_collapsed_p.txt > f_plotting.txt
#read in z-scores
zscores <- read.csv("f_plotting.txt", sep = "\t")
zscores[zscores < 0.05] <- 0
write.table(zscores, "f_5_percent_plotting_trimmed.txt", sep='\t',row.names=F, quote=F)
./dtools.py f_5_percent_plotting_trimmed.txt tree_trimmed.tree --outgroup LW_ME_2 --ladderize
PhyloNet:
Two reticulations the most optimal.
### Trimming the vcf:
vcftools --vcf basa_final.vcf --remove remove.txt (a single individual per population, except for the outgroups)
Running PhyloNet from 0 to 10 networks:
# nexus file, each sample with a single letter different ID
# First 0 network, then 1 network, 2 etc.. based on the former network (see example file), 10 iterations
java -jar PhyloNet.jar phylonet_todos0.nex > results.txt
#### G.distances
##python vcf2phylip.py -i basa_final.vcf -fasta
library(reshape)
library(dplyr)
library(gdata)
library(ape)
library(ggplot2)
library(ggrepel)
library(ggExtra)
dna_aligned<-read.FASTA("basa_final.min4.fasta", type="DNA")
dna_distances<-as.matrix(dist.dna(dna_aligned, model="raw",pairwise.deletion=T)*100)
write.csv(dna_distances, "pairwise_distances.csv")
diag(dna_distances)<- NA
data<- melt(dna_distances)
p<-ggplot(data = data, aes(x = X2, y = value, label = X1)) +
geom_point() + scale_y_continuous(breaks=seq(0,5,1)) +
xlab('Taxa')+ ylab("Genetic distance % (raw)")+theme_classic()+theme(axis.text.x = element_text(angle =
90,hjust=0.65,vjust=0.1),panel.background = element_rect(fill = NA),
panel.grid.major = element_line(colour = "lightgray"),panel.grid.major.y=element_line(size = NA),
panel.ontop = F) + geom_hline(yintercept=3, linetype="dashed", color ="gray")
p1 <-ggMarginal(p, margins = "y", type="histogram")
ggsave(plot=p1,"genetic_distances.pdf", width=30,height=15)